import pandas as pd
import numpy as np
# Time
from datetime import datetime, timedelta
# Maps and geo
import geopy.distance as dist
import folium
from folium.plugins import HeatMapWithTime
# Visualization
from plotly import graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
# Preprocessing
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Models
from sklearn import cluster
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from catboost import CatBoostClassifier
# CV and optimization
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from bayes_opt import BayesianOptimization # docs are here https://github.com/fmfn/BayesianOptimization
# Metrics
from sklearn.metrics import classification_report, accuracy_score
# Feature importance
import shap
# Serialization
import joblib
# Seed kept in one place so experiments can be reproduced
RANDOM_STATE = 42

# Load the rides and order them chronologically; the index is rebuilt so that
# it runs 0..n-1 in the sorted order
df = (
    pd.read_csv('robotex5.csv')
    .sort_values(by='start_time')
    .reset_index(drop=True)
)
df.info()
# Output of df.info():
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 627210 entries, 0 to 627209
# Data columns (total 6 columns):
#  #   Column      Non-Null Count   Dtype
# ---  ------      --------------   -----
#  0   start_time  627210 non-null  object
#  1   start_lat   627210 non-null  float64
#  2   start_lng   627210 non-null  float64
#  3   end_lat     627210 non-null  float64
#  4   end_lng     627210 non-null  float64
#  5   ride_value  627210 non-null  float64
# dtypes: float64(5), object(1)
# memory usage: 28.7+ MB
# Convert to datetime and derive separately: date, hour, minute and day of the
# week, which might help us in predictions
df['start_time'] = pd.to_datetime(df['start_time'])
df['date'] = df['start_time'].dt.date
df['hour'] = df['start_time'].dt.hour
df['minute'] = df['start_time'].dt.minute
df['day_of_the_week'] = df['start_time'].dt.dayofweek

# Make bins of 30 min intervals. I assume that 30 min is the time required to
# place an order, wait for pickup and complete a ride within Tallinn.
# It would be nice to have the actual average ride time, but given the data we
# need to make this assumption.
# The bins will be used for further demand and gain calculations.
# Both columns are computed with vectorized datetime operations instead of a
# per-row apply: the result is the same, without a Python call per ride.
df['start_time_bin'] = df['start_time'].dt.floor('30min')
df['end_time_bin'] = df['start_time_bin'] + timedelta(minutes=30)
# We can calculate the distances between given coordinates. It would be nice to use an external API (i.e. Google Maps) to have more accurate road
# distances but since it's not free we can use simple "flying" distances for now. They will be used to calculate gains.
def distance(row, lat1:str, lng1:str, lat2:str, lng2:str) -> float:
    """Return the geodesic distance, in kilometres, between two points of a row.

    `row` is indexed with the four given keys: `lat1`/`lng1` select the first
    point and `lat2`/`lng2` the second. The distance is computed with
    geopy's geodesic, so latitudes and longitudes must be in degrees.
    """
    origin = (row[lat1], row[lng1])
    destination = (row[lat2], row[lng2])
    return dist.geodesic(origin, destination).km
# Add a `distance` column: start -> end distance of every ride, computed row by
# row with distance(); the column names are forwarded as positional arguments
df = df.assign(
    distance=df.apply(
        distance,
        axis=1,
        args=('start_lat', 'start_lng', 'end_lat', 'end_lng'),
    )
)
# End coordinates contain some unrealistic values, so drop those rides.
# A ride is kept only when its end point lies strictly inside the box spanned
# by the start coordinates, widened on every side by one standard deviation of
# the corresponding start coordinate.
df = df.loc[
    df['end_lat'].gt(df['start_lat'].min() - df['start_lat'].std())
    & df['end_lat'].lt(df['start_lat'].max() + df['start_lat'].std())
    & df['end_lng'].gt(df['start_lng'].min() - df['start_lng'].std())
    & df['end_lng'].lt(df['start_lng'].max() + df['start_lng'].std())
]
# Let's create an hourly grouped heat map with all starting positions in order
# to visually estimate potential zones / clusters as well as possible errors.
# The Folium library is used to create and plot the map.
def map_builder(heat:list) -> folium.Map:
    """Build a time-slider heat map from a list of per-frame coordinate lists.

    `heat` holds one entry per frame; each entry is a list of [lat, lng]
    pairs. Frames are labelled with their position in `heat` (0, 1, 2, ...),
    so callers should pass the frames in the order they want them displayed.

    Returns the folium map with the heat-map layer already added.
    """
    # Map settings
    m = folium.Map(location=[59.44, 24.75], tiles='OpenStreetMap', zoom_start=11)  # Tallinn center location
    # Heat map: one slider step per entry of `heat`
    HeatMapWithTime(heat, auto_play=False, index=list(range(len(heat))), radius=5).add_to(m)
    return m
# Group start coordinates by hour.
# groupby yields the hours in ascending order, so the position of each frame
# follows the hour order instead of the order in which hours first appear in
# the data. Each frame is a list of [start_lat, start_lng] pairs.
heat = [
    hour_rows[['start_lat', 'start_lng']].to_numpy().tolist()
    for _, hour_rows in df.groupby('hour')
]
# Map
# NOTE: ADJUST THE SLIDER IN THE BOTTOM TO GET HEATMAP FOR SPECIFIC HOUR
map_builder(heat)